Add .tar.gz / .tgz archive support for extension, preset, and workflow installation#2395
Add .tar.gz / .tgz archive support for extension, preset, and workflow installation#2395Copilot wants to merge 23 commits into
Conversation
…installation Agent-Logs-Url: https://github.com/github/spec-kit/sessions/9fb9a8ea-0967-4baf-b95c-7101e423ff58 Co-authored-by: mnriem <15701806+mnriem@users.noreply.github.com>
Agent-Logs-Url: https://github.com/github/spec-kit/sessions/9fb9a8ea-0967-4baf-b95c-7101e423ff58 Co-authored-by: mnriem <15701806+mnriem@users.noreply.github.com>
Agent-Logs-Url: https://github.com/github/spec-kit/sessions/9fb9a8ea-0967-4baf-b95c-7101e423ff58 Co-authored-by: mnriem <15701806+mnriem@users.noreply.github.com>
Agent-Logs-Url: https://github.com/github/spec-kit/sessions/9fb9a8ea-0967-4baf-b95c-7101e423ff58 Co-authored-by: mnriem <15701806+mnriem@users.noreply.github.com>
Agent-Logs-Url: https://github.com/github/spec-kit/sessions/9fb9a8ea-0967-4baf-b95c-7101e423ff58 Co-authored-by: mnriem <15701806+mnriem@users.noreply.github.com>
Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com>
Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com>
Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com>
| # On older versions pass only the pre-validated members so that no | ||
| # unvetted entry (added concurrently or via a race) slips through. | ||
| if sys.version_info >= (3, 12): | ||
| tf.extractall(dest_dir, filter="data") # type: ignore[call-arg] |
|
@copilot apply changes based on the comments in this thread |
Fixed in commit — the Python 3.12+ branch now calls |
| # Detect archive format from URL; resolve via Content-Type when needed. | ||
| # `final_url` may differ from `download_url` if the server redirects. | ||
| archive_fmt = detect_archive_format(download_url) | ||
|
|
||
| # Download the archive | ||
| try: | ||
| with self._open_url(download_url, timeout=60) as response: | ||
| zip_data = response.read() | ||
|
|
||
| zip_path.write_bytes(zip_data) | ||
| return zip_path | ||
| final_url = response.geturl() | ||
| if not archive_fmt: | ||
| content_type = response.headers.get("Content-Type", "") | ||
| archive_fmt = detect_archive_format(final_url, content_type) | ||
| archive_data = response.read() |
| # Detect archive format from URL; resolve via Content-Type when needed. | ||
| # `final_url` may differ from `download_url` if the server redirects. | ||
| archive_fmt = detect_archive_format(download_url) | ||
|
|
||
| try: | ||
| with self._open_url(download_url, timeout=60) as response: | ||
| zip_data = response.read() | ||
|
|
||
| zip_path.write_bytes(zip_data) | ||
| return zip_path | ||
| final_url = response.geturl() | ||
| if not archive_fmt: | ||
| content_type = response.headers.get("Content-Type", "") | ||
| archive_fmt = detect_archive_format(final_url, content_type) | ||
| archive_data = response.read() |
| with tempfile.TemporaryDirectory() as tmpdir: | ||
| zip_path = Path(tmpdir) / "preset.zip" | ||
| archive_fmt = _det_fmt(from_url) | ||
| final_url = from_url | ||
| try: | ||
| with urllib.request.urlopen(from_url, timeout=60) as response: | ||
| zip_path.write_bytes(response.read()) | ||
| final_url = response.geturl() | ||
| if not archive_fmt: | ||
| content_type = response.headers.get("Content-Type", "") | ||
| archive_fmt = _det_fmt(final_url, content_type) | ||
| archive_data = response.read() |
| archive_fmt = detect_archive_format(from_url) | ||
| archive_path = None | ||
|
|
||
| try: | ||
| with urllib.request.urlopen(from_url, timeout=60) as response: | ||
| zip_data = response.read() | ||
| zip_path.write_bytes(zip_data) | ||
| final_url = response.geturl() | ||
| if not archive_fmt: | ||
| content_type = response.headers.get("Content-Type", "") | ||
| archive_fmt = detect_archive_format(final_url, content_type) | ||
| archive_data = response.read() |
|
@copilot apply changes based on the comments in this thread |
Applied the change in all 4 URL-download paths ( Format detection now runs against the post-redirect |
| # post-redirect URL first (with Content-Type fallback); only use the | ||
| # original `download_url` as a last hint if the final URL gives no | ||
| # signal. | ||
| final_url = download_url |
| # Determine the archive format from the post-redirect URL first | ||
| # (with Content-Type fallback); only use the original `download_url` | ||
| # as a last hint if the final URL gives no signal. | ||
| final_url = download_url |
Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com>
There was a problem hiding this comment.
Copilot's findings
Comments suppressed due to low confidence (1)
src/specify_cli/init.py:3679
final_urlis checked for HTTPS only afterarchive_data = response.read(). This means an insecure redirect can still be fetched before you abort. To make the scheme-downgrade guard effective, validate the scheme right afterresponse.geturl()and before reading the response body (or disallow non-HTTPS redirects in the opener).
with urllib.request.urlopen(from_url, timeout=60) as response:
final_url = response.geturl()
content_type = response.headers.get("Content-Type", "")
archive_fmt = detect_archive_format(final_url, content_type)
if not archive_fmt:
archive_fmt = detect_archive_format(from_url)
archive_data = response.read()
- Files reviewed: 6/6 changed files
- Comments generated: 3
| with self._open_url(download_url, timeout=60) as response: | ||
| zip_data = response.read() | ||
|
|
||
| zip_path.write_bytes(zip_data) | ||
| return zip_path | ||
| final_url = response.geturl() | ||
| content_type = response.headers.get("Content-Type", "") | ||
| archive_fmt = detect_archive_format(final_url, content_type) | ||
| if not archive_fmt: | ||
| archive_fmt = detect_archive_format(download_url) | ||
| archive_data = response.read() |
| with self._open_url(download_url, timeout=60) as response: | ||
| zip_data = response.read() | ||
|
|
||
| zip_path.write_bytes(zip_data) | ||
| return zip_path | ||
| final_url = response.geturl() | ||
| content_type = response.headers.get("Content-Type", "") | ||
| archive_fmt = detect_archive_format(final_url, content_type) | ||
| if not archive_fmt: | ||
| archive_fmt = detect_archive_format(download_url) | ||
| archive_data = response.read() | ||
|
|
| with urllib.request.urlopen(from_url, timeout=60) as response: | ||
| zip_path.write_bytes(response.read()) | ||
| final_url = response.geturl() | ||
| content_type = response.headers.get("Content-Type", "") | ||
| # Prefer the post-redirect URL for format detection; | ||
| # fall back to the original URL only as a last hint. | ||
| archive_fmt = _det_fmt(final_url, content_type) | ||
| if not archive_fmt: | ||
| archive_fmt = _det_fmt(from_url) | ||
| archive_data = response.read() |
|
@copilot apply changes based on the comments in this thread |
Applied in 5d75366: in all four URL-download paths ( |
| archive_fmt = detect_archive_format(final_url) | ||
| if not archive_fmt: | ||
| content_type = resp.headers.get("Content-Type", "") | ||
| archive_fmt = detect_archive_format(final_url, content_type) |
| # Detect archive format from the final URL or Content-Type header. | ||
| cat_archive_fmt = detect_archive_format(final_url) | ||
| if not cat_archive_fmt: | ||
| cat_ct = response.headers.get("Content-Type", "") | ||
| cat_archive_fmt = detect_archive_format(final_url, cat_ct) |
| # Extract ZIP safely (prevent Zip Slip attack) | ||
| with zipfile.ZipFile(zip_path, 'r') as zf: | ||
| # Validate all paths first before extracting anything | ||
| temp_path_resolved = temp_path.resolve() | ||
| for member in zf.namelist(): | ||
| member_path = (temp_path / member).resolve() | ||
| # Use is_relative_to for safe path containment check | ||
| try: | ||
| member_path.relative_to(temp_path_resolved) | ||
| except ValueError: | ||
| raise ValidationError( | ||
| f"Unsafe path in ZIP archive: {member} (potential path traversal)" | ||
| ) | ||
| # Only extract after all paths are validated | ||
| zf.extractall(temp_path) |
| with zipfile.ZipFile(zip_path, 'r') as zf: | ||
| temp_path_resolved = temp_path.resolve() | ||
| for member in zf.namelist(): | ||
| member_path = (temp_path / member).resolve() | ||
| try: | ||
| member_path.relative_to(temp_path_resolved) | ||
| except ValueError: | ||
| raise PresetValidationError( | ||
| f"Unsafe path in ZIP archive: {member} " | ||
| "(potential path traversal)" | ||
| ) | ||
| zf.extractall(temp_path) |
The extension, preset, and workflow download pipelines only accepted ZIP archives, blocking use of npm registries and CI artifact stores that serve tarballs natively.
Core utilities (
extensions.py)detect_archive_format(url, content_type="")— infers format from URL path extension (.zip,.tar.gz,.tgz) withContent-Typeheader fallback (application/gzip,application/x-gzip,application/x-tar+gzip)safe_extract_tarball(archive_path, dest_dir, error_class)— safe extraction with:..traversal, symlinks, hard links, devices, and FIFOsXHDTYPE,XGLTYPE,SOLARIS_XHDTYPE) and GNU metadata-only entries (GNUTYPE_LONGNAME,GNUTYPE_LONGLINK) are silently skipped — they carry no extractable payload and are emitted by many common archiving tools;GNUTYPE_SPARSEis intentionally not skipped because sparse entries carry a real file payload andisreg()returnsTruefor themsafe_memberslist toextractall()to ensure only vetted entries are extractedtarfile.data_filterfor extra OS-level protection, combined withmembers=safe_memberstarfile.TarError/OSErrorare caught and re-raised as the caller-suppliederror_classfor consistent error handlingBoth helpers are public (no underscore prefix) and imported directly by
presets.pyand__init__.py.Extensions & presets
install_from_zip()on both managers now detects archive format from the file extension and dispatches to ZIP or tarball extraction accordingly — existing callers are unaffecteddownload_extension()/download_pack()captureresponse.geturl()as the canonical post-redirect URL and detect archive format from the final URL first (withContent-Typefallback), only consulting the originaldownload_urlas a last hint. This ensures redirects that change the archive extension or type are honored when choosing the on-disk suffix and dispatching to the correct extractor. Unknown formats are rejected with a clear error rather than silently defaulting to ZIP, and the final URL's scheme is re-validated to guard against scheme-downgrade via redirects.__init__.pycall sitesextension add --fromfinal_urlfirst (Content-Type fallback), then original URL as last hint; unknown formats rejected; HTTPS re-checked on post-redirect URLpreset add --fromextension updatePath(extension).nameto prevent path traversalworkflow add(URL)workflow.ymlfrom archive when URL points to one; temp-file paths initialized before write to avoidUnboundLocalErroron disk-fullworkflow add(local).tar.gz/.tgz/.ziparchive files (case-insensitive detection)workflow add(catalog)A shared
_extract_workflow_yml(archive_path, fmt)helper handles root-level and single-nested-directory layouts for both formats, withtarfile.extractfile()handles properly closed via context managers.Tests
30 new tests across
test_extensions.py,test_presets.py, andtest_workflows.pycovering:test_extensions.py: format detection (URL + Content-Type), flat and nested tarball install, missing manifest errors, path traversal rejection, symlink rejectiontest_presets.py: flat and nested tarball install, missing manifest errors, path traversal rejection, symlink rejectiontest_workflows.py(TestWorkflowAddArchive, 9 CLI-level tests): local ZIP (flat/nested), local.tar.gz(flat/nested),.tgzalias, missingworkflow.ymlerror cases, URL-based archive download for both ZIP and tar.gz formats